### clean raw datasets and merge ###

library(tidyverse)
library(lubridate)
library(readxl)
library(estimatr)
library(texreg)
library(gridExtra)
library(RColorBrewer)
library(ggpubr)
library(gt) # To install gt, run `remotes::install_github("rstudio/gt")`.
# You might need to run `install.packages("remotes")` first.
library(modelsummary)
library(nnet)

# Standardise one wave of raw Qualtrics responses.
#
# Arguments:
#   data      Raw responses. Must contain the columns Status, Finished, Q1,
#             Group, Q12_1, Q12_2, Q22_1, Q22_2 and the Q* columns renamed
#             below.
#   wave_chr  Label written to the `wave` column of the result.
#
# Returns a data frame restricted to finished, non-preview responses with
# Q1 == "Yes" (NOTE(review): Q1 is presumably the consent item -- confirm),
# holding: wave, the call_* and responsive_* forecast columns, familiarity,
# optimism, pilot_first, compliance_first, position, ans_phone, ans_quest.
clean_qualtrics <- function(data, wave_chr) {
  # Analysis name = raw Qualtrics column
  question_names <- c(
    call_eval_mpa = "Q16_1",
    call_eval_gov = "Q16_2",
    call_account = "Q16_3",
    responsive_eval_mpa = "Q15_1",
    responsive_eval_gov = "Q15_2",
    responsive_account = "Q15_3",
    position = "Q2",
    optimism = "Q8",
    familiarity = "Q9",
    call_eval_mpa_rev = "Q17_1",
    call_eval_gov_rev = "Q27_1",
    call_account_rev = "Q28_1",
    responsive_eval_mpa_rev = "Q29_1",
    responsive_eval_gov_rev = "Q31_1",
    responsive_account_rev = "Q32_1"
  )

  # Familiarity label -> 1-4 score; "Don't know" becomes missing
  familiarity_scores <- c(
    `Very unfamiliar (1/4)` = 1,
    `Unfamiliar (2/4)` = 2,
    `Familiar (3/4)` = 3,
    `Very familiar (4/4)` = 4,
    `Don’t know` = NA_real_
  )

  # Raw job title -> coarse position group; titles not listed here are
  # left as they are
  position_groups <- c(
    `Graduate Student` = "gradstudent",
    `Graduate student` = "gradstudent",
    `Post-Doctoral Fellow` = "postdoc",
    `Faculty` = "faculty",
    `Research Manager` = "researchstaffother",
    `Research Assistant` = "researchstaffother",
    `Other Research Staff` = "researchstaffother",
    `Administrative Staff` = "researchstaffother",
    `Other` = "researchstaffother"
  )

  completed <- filter(
    data,
    Status != "Survey Preview",
    Finished,
    Q1 == "Yes"
  )
  renamed <- rename(completed, all_of(question_names))

  recoded <- mutate(
    renamed,
    # Use the Q12 answer when present, otherwise the Q22 one; both are
    # divided by 100
    ans_phone = ifelse(is.na(Q12_1), Q22_1 / 100, Q12_1 / 100),
    ans_quest = ifelse(is.na(Q12_2), Q22_2 / 100, Q12_2 / 100),
    familiarity = recode(familiarity, !!!familiarity_scores),
    # Score is the first of the digits 1..5 (tested in that order) that
    # appears in the answer label; labels without any of them become NA
    optimism = case_when(
      grepl("1", optimism) ~ 1,
      grepl("2", optimism) ~ 2,
      grepl("3", optimism) ~ 3,
      grepl("4", optimism) ~ 4,
      grepl("5", optimism) ~ 5,
      TRUE ~ NA_real_
    ),
    wave = wave_chr,
    # 0/1 flags for whether Group mentions PILOT / COMPLIANCE
    pilot_first = as.integer(grepl("PILOT", Group)),
    compliance_first = as.integer(grepl("COMPLIANCE", Group)),
    position = recode(position, !!!position_groups)
  )

  select(
    recoded,
    wave,
    starts_with("call"),
    starts_with("responsive"),
    familiarity,
    optimism,
    pilot_first,
    compliance_first,
    position,
    starts_with("ans_")
  )
}

# UCSD wave
qdat <- clean_qualtrics(read_csv("../dataverse/ucsd_anon.csv"), "UCSD")

# CERP wave
cerpdat <- clean_qualtrics(read_csv("../dataverse/cerp_anon.csv"), "CERP")

# US export: holds two waves that are told apart below by recording date
# (grad students before May 22, CP list from then on)
usdat <- mutate(
  read_csv("../dataverse/us_anon.csv"),
  recorded_date = lubridate::as_date(
    RecordedDate,
    format = "%d-%m-%y %H:%M",
    tz = "UTC"
  )
)

# Grad students (UCLA and Stanford): recorded before the cutoff.
# Rows whose date failed to parse (NA) land in neither subset.
graddat <- clean_qualtrics(
  filter(usdat, recorded_date < dmy("22-05-19")),
  "GradStudents"
)

# CP list: recorded on or after the cutoff
cpdat <- clean_qualtrics(
  filter(usdat, recorded_date >= dmy("22-05-19")),
  "CPList"
)

# Merge data together: stack the four cleaned waves, derive the analysis
# variables, and write the result to disk.
# NOTE(review): the object name `all` masks base::all() as a variable; it is
# kept here because code outside this section may refer to it.
all <- bind_rows(qdat, cerpdat, graddat, cpdat) %>%
  mutate(
    # Bin variables: collapse the 1-5 optimism score into three groups
    optimism_group = recode(
      optimism,
      `1` = "Pessimistic",
      `2` = "Pessimistic",
      `3` = "Neutral",
      `4` = "Optimistic",
      `5` = "Optimistic"
    ),
    # 1 when the familiarity score is above 2.5, 0 otherwise, NA if missing
    familiar = as.numeric(familiarity > 2.5),
    # Accuracy of forecasts: summed absolute distance between each of the six
    # forecasts and a fixed benchmark (a `+` below means the benchmark is
    # negative).
    # NOTE(review): the benchmarks are presumably the realised estimates and
    # the forecasts presumably on the same scale -- confirm both.
    abs_error =
      abs(call_eval_mpa - 0.015) +
      abs(call_eval_gov - 0.022) +
      abs(call_account - 0.001) +
      abs(responsive_eval_mpa + 0.02) +
      abs(responsive_eval_gov - 0.018) +
      abs(responsive_account + 0.007),
    # ID for long, wide reshapes. seq_len() (unlike 1:n()) also works when
    # the merged data has zero rows.
    unique_id = seq_len(n())
  ) %>%
  # ans_ values that are still above 1 are divided by 100 once more
  mutate(
    across(
      starts_with("ans_"),
      ~ ifelse(.x > 1.000001, .x / 100, .x)
    )
  )

# Row-wise averages of the three call / three responsiveness forecasts.
# A row with any missing component gets NA (rowMeans() default).
all$call_avg <- rowMeans(
  all[, c("call_eval_mpa", "call_eval_gov", "call_account")]
)
all$resp_avg <- rowMeans(
  all[, c("responsive_account", "responsive_eval_gov", "responsive_eval_mpa")]
)

write_csv(all, file = "../dataverse/forecasting_clean.csv")

